Overview

This document will explore the implementation of various types of plots that can be produced with base R graphics and/or the ggplot2 package. For each plot type we begin by showing basic plots and then enhance the plots by combining plot types, adding new variables using faceting, color, shape or size, or customizing colors, axes, axis labels, annotations or styles.

Load Libraries

library(tidyverse)
library(vioplot)
library(ggmosaic)
library(scales) # provides alpha() function to set alpha for base R plots
library(hexbin)
library(GGally)
library(RColorBrewer)
library(viridis)
library(geojsonio)
library(gridExtra)
library(kableExtra) # For nice looking tables in html output
library(reshape) # For the melt function in heatmap example
library(lubridate)
library(geojsonio)
library(rgdal)
library(rgeos)
library(broom)
library(ggsoccer)

Read in Data Files

MM <- read.csv('data/MarchMadness.csv')
titanic <- read.csv("data/titanic.csv")
nba_players <- read_csv("data/nba_players.csv")
gender_spouse <- read.csv( file = "data/dataset_gender_spouse.csv")
steel <- read.csv("data/steel.csv")
data <- read.csv("data/titanic.csv", TRUE, sep = ",")
spdf <- geojson_read("https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/GeoJSON/europe.geojson", what = "sp")
L <- read.csv('data/phone.csv')
spd <- geojson_read("data/us_states_hexgrid.geojson",  what = "sp")
hockey <- read.csv("data/hockey_search.csv")
card <- read.csv('data/cards.csv')

Histogram

Basic Histogram (Base R)

Use the Iris dataset. Show distribution of petal width.

hist(iris$Petal.Width,
     main = "Iris Petal Width Histogram",
     xlab = "Petal Width")

Customized Histogram using Base R

hist(iris$Petal.Width,                                                
     main = "Iris Petal Width Histogram", xlab = "Petal Width", col='pink', breaks=20, xlim=c(0, 2.9), ylim=c(0, 40))

Creating Histograms for each Species on Petal Width using Base R

#Subsetting the different species
set <- iris$Sepal.Width[iris$Species == "setosa"]
ver <- iris$Sepal.Width[iris$Species == "versicolor"]
vir <- iris$Sepal.Width[iris$Species == "virginica"]
par(mfrow = c(3, 1))
hist(set, main = "Iris Setosa Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 6.5))
hist(ver, main = "Iris Versicolor Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 6.5))
hist(vir, main = "Iris Virginica Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 5.5))

Basic Histogram using GGPlot2

ggplot(iris, aes(x=Petal.Width)) + 
  geom_histogram()

Customized Histogram using GGPlot2

Changing the bins, color, outline color, adding title, and adding x and y labels

ggplot(iris, aes(x=Petal.Width)) + 
  geom_histogram(color="Blue", fill="orange", bins = 20) + 
  ggtitle("Iris Petal Width Histogram") + 
  xlab("Petal Width") + 
  ylab("Frequency")

GGPlot2 Histogram with Species Added in

ggplot(iris, aes(x=Petal.Width)) + 
  geom_histogram(color="yellow", aes(fill=Species), bins = 20) + 
  ggtitle("Iris Petal Width Histogram") + 
  xlab("Petal Width") + 
  ylab("Frequency")

Creating Histograms for each Species on Petal Width using GGPlot2

ggplot(iris, aes(x=Petal.Width)) + 
  geom_histogram(color="blue", fill="orange", bins = 20) + 
  ggtitle("Iris Petal Width Histogram") + 
  xlab("Petal Width") + ylab("Frequency") + 
  facet_grid(. ~ Species)

2d Histogram with Density

Examine seed vs point differential for the NCAA Men’s Basketball Tournament

#this is for more color
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
r <- rf(32)
#this is the actual setup of the plot
## MM is March Madness Data
x <- MM$Pointdiff
y <-  MM$Seed
dataframe <- data.frame(x, y)
h <- hexbin(dataframe)
plot(h, colramp = rf, xlab = "Point Differential", ylab = "Seed")

Let’s look at seed differential vs point differential instead.

#this is for more color
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
#this is the actual setup of the plot
MM <- MM %>% 
  mutate(Seeddiff = abs(Seed - Seed.1))
x <- MM$Seeddiff
y <-  MM$Pointdiff
h<- hexbin(x, y, xbins = 20)
plot(h, colramp = rf, 
     xlab = "Seed Differential", 
     ylab = "Point Differential",
     main = "Seed Differential vs Point Differential")

Try it with ggplot2’s geom_hex()

ggplot(MM, aes(x=Seeddiff, y=Pointdiff) ) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis") +
    labs(x = "Seed Differential",
       y = "Point Differential",
       title = "Seed Differential vs Point Differential") +
  theme_bw()

Examine the same relationship with a scatterplot, with regression line added.

fit <- lm(Pointdiff ~ Seeddiff, data = MM)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
ggplot(MM, aes(x=Seeddiff, y=Pointdiff)) +
  geom_point(size = 3, 
             alpha = 0.20, 
             color = "steelblue") +
  geom_smooth(method = lm, 
              se = FALSE, 
              linetype = "solid",
              color = "red") +
  labs(x = "Seed Differential",
       y = "Point Differential",
       title = "Higher Seed Differentials Are Associated with Higher Point Differentials") +
  annotate("text", x = 1, y = 52,
           label = as.expression(substitute(atop(R^2==r2, beta==slope),
                                            list(r2=r2, slope=slope))),
                                 vjust = "middle", hjust = "left") +
  theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

Take a look at average point differential by seed differential

MM %>% 
  group_by(Seeddiff) %>% 
  summarize(AvgPtDiff = round(mean(Pointdiff), 2)) %>% 
  kbl(align = "l") %>% 
  kable_styling()
Seeddiff AvgPtDiff
0 8.69
1 9.22
2 11.27
3 9.43
4 9.52
5 10.20
6 7.89
7 9.97
8 12.01
9 11.16
10 11.38
11 13.10
12 13.20
13 17.07
15 24.90

Closer examination of seed differential vs point differential

Hmm… It seems like there isn’t much difference in average point differential for seed differentials ranging from 2 to 10. Let’s rerun the scatterplot and regression for that range of seed differentials.

MM_sub <- MM %>% 
  filter(between(Seeddiff, 2, 10))
fit <- lm(Pointdiff ~ Seeddiff, data = MM_sub)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
ggplot(MM_sub, aes(x=Seeddiff, y=Pointdiff)) +
  geom_point(size = 3, 
             alpha = 0.20, 
             color = "steelblue") +
  geom_smooth(method = lm, 
              se = FALSE, 
              linetype = "solid",
              color = "red") +
  labs(x = "Seed Differential",
       y = "Point Differential",
       title = "Moderate Seed Differentials Don't Seem Very Important") +
  annotate("text", x = 2, y = 45,
           label = as.expression(substitute(atop(R^2==r2, beta==slope),
                                            list(r2=r2, slope=slope))),
                                 vjust = "middle", hjust = "left") +
  scale_x_continuous(breaks = 2:10, limits = c(2, 10)) +
  theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

Density Curve

Basic Density Curve Using Base R

# get a dataset from R
# Orange dataset has information about the growth of orange trees
orange <- Orange
# create a density plot
plot(density(orange$circumference),
     main = "CIRCUMFERENCE OF ORANGE TREES",
     xlab = "CIRCUMFERENCE",
     ylab = "DISTRIBUTION")
polygon(density(orange$circumference), col=7, border=2)

Basic Density Curve Using GGPlot2

# create the density plot
orange %>%
  ggplot( aes(x = circumference)) +
  geom_density(fill= 7, color= 2)

Adding a Density Plot to a Histogram

Use the same histogram created in the “Customized Histogram Using Base R” section and overlay a density curve on it.

x <- iris$Petal.Width
h <- hist(x, breaks=20, col="pink", freq=FALSE, xlab= "Petal Width", main="Histogram With Density Curve")
lines(density(x), lwd=2, col="blue")

density plot with groups

facet(ggplot2)

ggplot(data=diamonds,
       aes(x=price, fill = cut)) +
  geom_density(adjust = 1.5) +
  facet_wrap(~ cut) +
  theme(legend.position = "none")

Bar Plot

###Base R

### Create a data set to work with
survey <- c(apple=45, orange=25, honeydew=35, banana=50, watermelon=20, blueberry=15)
### Create a barchart with titles and color
barplot(survey,
        main="Favorite Fruit Survey",
        xlab="Fruit",
        ylab="Count",
        col=c("red", "orange", "green", "yellow", "pink", "blue"))
        
### Add a legend
legend("topright",
       legend = c("apple", "orange", "honeydew", "banana", "watermelon", "blueberry"),
       col = c("red", "orange", "green", "yellow", "pink", "blue"),
       pch = 19,
       bty = "n",
       pt.cex = 2,
       cex = 1.2)

GGplot2

# Use the same data set created in the base r example but make it into a data frame
survey1 <- data.frame(
  fruit=c("apple", "orange", "honeydew", "banana", "watermelon", "blueberry"),
  value=c(45, 25, 35, 50, 20, 15))
# Create the barchart
ggplot(survey1, aes(x=fruit, y=value)) + geom_bar(stat="identity") 

# Add color with black outlines
ggplot(survey1, aes(x=fruit, y=value)) + geom_bar(stat="identity", fill=c("red", "orange", "green", "yellow", "pink", "blue"), color="black") + labs(x="Count", y="Fruit", title="Fruit Survey") + theme(legend.position = "none")

Bar plot with data added above the bars (Base R)

First, calculate survival percentage and class count for each passenger class, using the aggregate() function, and then merge those two results into a data frame from the titanic dataset titanic.

# calculate survival percentage by class and count by class.
surv_pct <- aggregate(Survived ~ Pclass, data = titanic, FUN = mean)
class_ct <- aggregate(Survived ~ Pclass, data = titanic, FUN = length)
# Merge those two results into a single data frame, and specify 
# column names
by_class_data <- merge(surv_pct, class_ct,
                       by = "Pclass")
colnames(by_class_data) <- c("Pclass", "Surv_Pct", "Count")
#str(by_class_data)

Create the barplot. Note that we increase the ylim parameter a bit to leave room for labels above the bars. We also set axes = FALSE so that the default y-axis is not plotted. Note also that we assign the barplot object to a variable. What gets assigned to the variable is the x positions of the midpoints of each of the bars on the plot. We will use those x positions later to position the labels above the bars, since the x position for a text label specifies the center of the label.

In the next step we create a custom y-axis with the axis() function. The at parameter specifies the placement of the tick marks on the axis (“axis ticks”) and the labels parameter specifies the tick labels. Use the paste() function to make percentage labels.

Finally, we create the labels showing the count of passengers in each class and place them right above the bars. The x position comes from the barplot object and the y position is just the height of the bars plus a little extra so that the text labels don’t overlap the bars.

plt <- barplot(by_class_data$Surv_Pct, 
               names.arg = paste("Class", by_class_data$Pclass),
               col = 'steelblue',
               main = "Survival Percentage by Passenger Class",
               ylab = "Survival Percentage",
               xlab = "Passenger Class",
               ylim = c(0.0, 0.75),
               axes = FALSE
)
# Add y axis
axis(2, at = seq(0.0, 0.7, by = 0.1),
     labels = paste(seq(0, 70, by = 10), "%", sep=""),
     las = 1,
     cex.axis = 0.9)
# Add the text of the labels
text(x = plt, 
     y = by_class_data$Surv_Pct + 0.04, 
     labels = paste("n: ", by_class_data$Count, sep=""), 
     cex=1) 

Bar plot with data added above the bars (ggplot2)

First, calculate survival percentage and class count for each passenger class using titanic dataset using tidyverse approach.

survival <- titanic %>%  
  group_by(Pclass) %>% 
  summarize(Count = n(),
            Surv_Pct = mean(Survived))

Make the plot.

ggplot(survival, aes(x = Pclass, y = Surv_Pct)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = paste("n: ", Count, sep="")), vjust = -0.5) +
  xlab("Passenger Class") +
  ylab("Survival Percentage") +
  ggtitle("Survival Percentage by Passenger Class") +
  scale_y_continuous(limits = c(0, 0.70),
                     breaks = seq(0, .7, by = 0.10),
                     labels = paste(seq(0, 70, by = 10), "%", sep=""))  +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) # Centers title

side-by-side barplots

base r

x <- replicate(4, rnorm(100))
apply(x, 2, mean)
## [1]  0.015167289 -0.143034063 -0.006108029 -0.090066092
barplot(matrix(c(5,3,8,9),nr=2), beside=T, 
        col=c("aquamarine3","coral"), 
        names.arg=LETTERS[1:2])
legend("topleft", c("A","B"), pch=15, 
       col=c("aquamarine3","coral"), 
       bty="n")

ggplot2

Using gender_spouse dataset.

names(gender_spouse)
## [1] "gender" "spouse" "total"
# check the number of students of each gender
##side-by-side bars
ggplot(gender_spouse, aes(y=total, x=gender, fill=spouse))+
  geom_bar(stat="identity" ,position = "dodge")

Circular Bar Plot using GGPlot2

##Creating a general circular barplot
##Creating a data frame
data <- data.frame(
  colors=c("red", "orange", "yellow", "green", "blue", "purple", "pink", "black", "white"), 
  value=c(50, 35, 80, 60, 75, 65, 55, 90, 10)
)
##Making the plot
p <- ggplot(data, aes(x=as.factor(colors), y=value)) + 
##Numeric value in alpha changes how bright or vibrant the color in the graph appears  
 geom_bar(stat="identity", fill=alpha("red", 0.5)) +
##Adding limits
ylim(-120, 150) + theme_minimal() + theme(
  axis.text = element_blank(), 
  axis.title = element_blank(), 
  panel.grid = element_blank(), 
  plot.margin = unit(rep(-2, 4), "cm")) + 
##Making the coordinate polar instead of cartesian 
  coord_polar(start = 0)
p

Creating Circular Bar Plot with Labels

data <- data.frame(
  id=seq(1, 9),
  individual=paste( c("red ", "orange ", "yellow ",
                    "green ", "blue ", "purple ",
                    "pink ", "black ", "white "), seq(1,9), sep=""),
  value=sample( seq(10, 100), 9, replace=T)
)
##Prepping the data to create labels for the graph
# Get the name and the y position of each label
label_data <- data
 
# calculating the angles on the label
number_of_bar <- nrow(label_data)
angle <-  90 - 360 * (label_data$id-0.5) /number_of_bar     # 0.5 is subtracted because the text needs to be on the angle of the center of the bars. 
 
# calculate the alignment of labels: right or left
# If I am on the left part of the plot, my labels have currently an angle < -90
label_data$hjust<-ifelse( angle < -90, 1, 0)
 
# flip angle BY to make them readable
label_data$angle<-ifelse(angle < -90, angle+180, angle)
# ----- ------------------------------------------- ---- #
 
# Creating the plot
p <- ggplot(data, aes(x=as.factor(id), y=value)) +       
  geom_bar(stat="identity", fill=alpha("red", 0.5)) +
  ylim(-100,120) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-1,4), "cm")      
  ) +
  coord_polar(start = 0) +
  
  # Add the labels, using the label_data dataframe that we have created before
  geom_text(data=label_data, aes(x=id, y=value+10, label=individual, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=2.5, angle= label_data$angle, inherit.aes = FALSE ) 
 
p

Line Plot (“Time Series Plot”)

This time series shows the water level in Lake Huron from 1875 to 1972.

The original dataset from R is a time series, and in order to plot it I had to transform to a dataframe and then plot using that instead.

lakehuron <- data.frame(LakeHuron)

ggplot(lakehuron, aes(x=(1875:1972), y= LakeHuron)) +
  geom_line() + 
  ggtitle("WATER LEVELS IN LAKE HURON") +
  xlab("") +
  ylab("WATER LEVEL")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

Creating a line plot with conditional coloring

The first step is to get the data and create the conditional variable(s). For this plot I will use discharge data from the Youghiogheny River to visualize flood events. What even is a flood? There are many ways that flooding events can be quantified, but among the simplest are to see when discharge exceeds 3 & 7 times the monthly median. Here is how to process the data for that.

# The URL used to query the USGS database 
# includes parameters including data type, site number, and dates
URL = 'https://waterdata.usgs.gov/usa/nwis/dv?cb_00060=on&format=rdb&site_no=03075500&referred_module=sw&period=&begin_date=2018-01-01&end_date=2020-12-31'

# data is displayed with some extra comments so it takes a little additional processing to clean up
d = read.delim(URL, comment.char = '#', stringsAsFactors = F)
d=d[-1,]
d$datetime=as.POSIXct(d$datetime)
colnames(d)[4]='discharge'
d$discharge = as.numeric(d$discharge)

# creating month & year variables for aggregation
d$month=month(d$datetime)
d$year=year(d$datetime)

# creating a data frame of median discharges for each year & then upper thresholds as described by Olden & Poff (2003)

# aggregating for median using unique months & years
aggDischarge=aggregate(data=d, discharge~year+month, FUN=median, na.rm=TRUE)
colnames(aggDischarge)[3]='monthly.median' #rename column

# merging the calculated medians back into the main data frame. all.x=T prevents any of the original data from being removed by allowing monthly medians to be copied as many times necessary 
d=merge(d, aggDischarge, all.x=T)

#identify flood events, upper thresholds are defined as 3 and 7 times monthly median discharge
d$flood.threshold3=d$monthly.median*3
d$flood.threshold7=d$monthly.median*7

# creating a factor variable for each reading indicating if a flood event threshold is exceeded
d$flood.event3=ifelse(d$discharge>d$flood.threshold3,1,0) 
d$flood.event7=ifelse(d$discharge>d$flood.threshold7,1,0)

# if discharge surpasses 7 medians, then logically it will be above 3 medians as well. To create a single variable with three factor levels (non-flood, 3, & 7), we can simply add the flood.event columns together to create values of 0, 1, & 2
d$flood.event=as.factor(d$flood.event3+d$flood.event7)

Plotting the results

There are two important aesthetics to set to get this to work. 1. set color = flood.event This tells R to use the variable to change the line color by the associated value

  1. set group = 1 By default, setting a color separates the factor levels into different lines. This tells R to keep the data together.
ggplot(d, aes(x=datetime, y=discharge,
              color = flood.event, group = 1))+
  labs(title = 'Flood Events of the Youghiogheny River near Oakland, MD', x = 'year', y = 'discharge (cu. ft/sec)',
       color='Flood Event')+
  geom_line(size=0.75)+
  # customize the line colors and legend labels with scale_color_manual()
  scale_color_manual(values=c('black', 'orange', 'red'),
                     labels=c('non-flood', 'moderate', 'heavy'))+
  scale_y_continuous(expand = c(0, 0))+ # removing some empty space between the x axis and title
  theme_classic()

Stacked Area Plot

`steel’ Data is total steel produced annually from the 5 largest steel producing countries Make the plot. geom_area() is the function that produces the stacked area plot.

ggplot(steel, aes(x=year, y=Steel, fill=Country)) + 
    geom_area() + ggtitle("Annual Steel Produced, Top 5 Countries (Million Tons)") + ylab("Steel Produced (Million Tons)") + xlab("Year")

Segmented Stacked Bar Plot

ggplot(steel, aes(fill=Country, y=Steel, x=year)) + 
    geom_bar(position="fill", stat="identity") + ggtitle("Share of Steel Produced Annually") + ylab("% of Steel Produced") + xlab("Year")

Boxplot

In the Boxplot section add two basic boxplot examples, one utilizing base R and one utilizing ggplot2. Note that even basic examples should have appopriate titles and axis labels.

Base R:

# Some data to work with:
data(ChickWeight)
#Boxplot:
boxplot(ChickWeight$weight ~ ChickWeight$Time,
        main = "Chick Age versus Chick Weight",
        xlab = "Chick Age in Days",
        ylab = "Chick Weight in Grams")

ggplot2:

p <- ggplot(ChickWeight, aes(group=Time, y=weight)) + 
  geom_boxplot() +
  labs(title="Chick Age versus Chick Weight",x="Chick Age in Days", y = " Chick Weight in Grams")+
  theme_classic()
p

A Boxplot with Jitter (GGPLOT2)

Using Jitter, you can display the individual data points in each box in a way that is easy to see and understand.

This example is based on one found at the R Graph Gallery website: https://www.r-graph-gallery.com/89-box-and-scatter-plot-with-ggplot2.html

data("ChickWeight")
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time"
##   ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(days)"
##   ..$ y: chr "(gm)"
ChickWeight %>%
  ggplot( aes(x=Diet, y=weight, fill=Diet)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    geom_jitter(color= "red", size=0.6, alpha=0.9) +
    theme(
      legend.position="none",
      plot.title = element_text(size=14)
    ) +
    ggtitle("Chick Weight by Diet") +
    ylab("Chick Weight in Grams") +
    xlab("Chick Diet")

Violin Plot

Pulling data from OKC, UTA, and CHI from nba_players dataset

team <- nba_players$TEAM[nba_players$TEAM %in% c("OKC", "UTA", "CHI")]
pts <- nba_players$PTS[nba_players$TEAM %in% c("OKC", "UTA", "CHI")]
data <- data.frame(team, pts)

Basic Plot with Base R

with(data , vioplot( 
  pts[team=="OKC"], pts[team=="UTA"], pts[team=="CHI"],  
  col= c(255,234,268) , names=c("OKC","UTA","CHI"),
  xlab = "Teams",
  ylab = "Points",
  main = "NBA Points Per Team",
  ylim = c(0,27),
  border = c(sample(200:900, 3)),
  rectCol = c(893)
))

Basic Plot with ggplot2

I did add a boxplot on top of the violin plot to make it resemble the base r graph.

ggplot(data, aes(x= team, y = pts, fill = team)) + 
  geom_violin(trim = F, show.legend = T) + 
  geom_boxplot(width = 0.1)

Strip Chart

Strip chart in base R

Use the built-in dataset: iris to create a stripchart for the variable Sepal.Width

Here we can use the alpha() function from the scales package to set an alpha (transparency) level for the color, ranging from 0 (totally transparent) to 1 (totally translucent). method - 'jitter' adds random noise in the y-dimension and the jitter() function applied to the variable being plotted adds random noise to the actual values of the variable. amount = 0 causes that random noise to be z/50 where z is the range of the variable.

stripchart(jitter(iris$Sepal.Width, amount = 0),
           main = "Sepal Width Distribution",
           xlab = "Sepal Width (cm)",
           col= alpha("blue", 0.5),
           pch = 19,
           method= 'jitter')

Strip chart in ggplot2

With ggplot2’s geom_jitter() is a shortcut for geom_point(position = "jitter"). Its width and height parameters control the amount of random noise in the x and y dimensions, respectively. With ggplot2 we can set alpha directly without the use of any helper functions. However, with ggplot2 it is a bit tricky to make a strip chart for just one variable (without a second variable for grouping), as we need to set the y variable, set axis limits, and suppress the axis ticks and labels for the unused axis.

ggplot(data = iris) + 
  geom_jitter(aes(x=Sepal.Width, y=0), 
              width = 0.05, 
              height = 0.2, 
              color = "blue",
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  scale_y_continuous(limits = c(-1, 1),breaks = NULL, labels = NULL) + 
  labs(title = "Sepal Width Distribution",
       y = NULL,
       x = "Sepal Width (cm)")

Strip Chart Divided into Groups, Base R

palette(brewer.pal(n = 3, name = "Dark2"))

stripchart(Age ~ Pclass, data = titanic,
           vertical = TRUE,
           method = "jitter",
           pch = 1,
           col = 1:3,
           main = "Titanic Passenger Age by Class",
           xlab = "Class")

Strip Chart Divided into Groups, ggplot2

Here is an example where the ggplot2 plot takes more work than the base R plot. Need to change Pclass to a factor or manipulate the x-axis scale to make sure it only shows 1, 2, and 3. As long as we are changing Pclass to a factor anyway let’s make the labels for the classes “First”, “Second”, and “Third”.

titanic <- titanic %>% 
  mutate(Pclass = factor(Pclass, labels = c("First", "Second", "Third")))

Used the scale_color_brewer() function to set this plot to the same RColorBrewer palette as the base R example.

ggplot(data = titanic, aes(x = Pclass, y = Age, color = Pclass)) +
  geom_jitter(pch = 1, cex = 2, width = 0.12) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  theme(legend.position = "none") +
  labs(title = "Titanic Passenger Age by Class",
       x = "Class",
       y = "Age")

Enhanced Strip Chart Divided into Groups, ggplot2

This is fairly easy to do with ggplot2, as geom_jitter() by default takes two variables as parameters. One can be the categorical group, iris species.

ggplot(data = iris) + 
### Add Boxplot on top of the strip plot for iris species. - used https://www.youtube.com/watch?v=mbvU8fF4eGM&t=613s as a reference
  geom_boxplot(mapping = aes(x = Sepal.Width, y = Species, fill = Species),
               alpha = 0.25,
               width = 0.5,
               show.legend = FALSE) +
               
  geom_jitter(aes(y=Species, x=Sepal.Width, color = Species), 
              width = 0.05, 
              height = 0.2,
              pch = 19,
              cex = 2) +
  labs(title = "Sepal Width Distribution by Iris Species",
       x = "Sepal Width (cm)",
       y = NULL) + 
  theme_bw() +
  theme(legend.position = "none")

Strip Chart with NBA Data, ggplot2

Strip charts are particularly useful with small datasets. The NBA data has a small number of players per team, so let’s try to use strip charts to compare the distribution of scoring on NBA teams, similar to the comparisons in the violin plot section.

Basic Strip Chart of NBA Scoring with ggplot2

nba_players %>% 
  filter(TEAM %in% c("OKC", "UTA", "CHI")) %>% 
  ggplot(aes(x =PTS, y = TEAM)) + 
    geom_point() +
    labs(title = "Scoring Distribution on Selected NBA Teams")

More Customized Strip Chart of NBA Scoring with ggplot2

top_ten <- nba_players %>% 
  filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>% 
  group_by(TEAM) %>% 
  arrange(desc(PTS), .by_group = TRUE) %>% 
  mutate(rn = row_number()) %>% 
  filter(rn <= 10)
top_one <- nba_players %>%
  filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>% 
  group_by(TEAM) %>%
  filter(row_number(desc(PTS)) == 1)
ggplot(data = top_ten, aes(x =PTS, y = TEAM)) + 
  geom_point() +
  scale_x_continuous(breaks = seq(5, 30, 5)) +
  geom_text(aes(label = PLAYER), 
            data = top_one,
            nudge_x = 0.25,
            angle = 90,
            alpha = 0.5,
            vjust = "top", 
            hjust = "middle") +
  labs(title = "Scoring Distribution of Top 10 Scorers") +
  theme_bw() +
  theme(panel.grid.major.y = element_blank()) +
  geom_boxplot(alpha = 0,
               linetype = 3)

Strip Plot showing the distribution of a numberic variable by a categorical variable

This example shows a strip plot to compare chick weights with different types of feed.

#in ggplot2:
ggplot(data = chickwts) + 
  geom_jitter(aes(y=feed, x=weight, color = feed), 
              width = 0.05, 
              height = 0.2, 
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  labs(title = "Chick Weight by Feed Type",
       x = "Chick Weight in Grams",
       y = "Feed Type") + 
  theme_bw() +
  theme(legend.position = "none")

A second example, this time using the Carbon Dioxide Uptake in Grass Plants dataset

ggplot(data = CO2) + 
  geom_jitter(aes(y=Treatment, x=uptake, color = Treatment), 
              width = 0.05, 
              height = 0.2, 
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  labs(title = "Carbon Dioxide Uptake in Grass Plants by Treatment Type",
       x = "Carbon Dioxide Uptake",
       y = "Treatment Type") + 
  theme_bw() +
  theme(legend.position = "none")

Normal Probability (QQ) Plot

A quantile-quantile (“QQ”) plot is used to compare an empirical distribution to a theoretical distribution by plotting the observed quantiles vs the theoretical quantiles. They are typically used to evaluate whether or not a variable is normally distributed.

The built-in rivers dataset gives the lengths (in miles) of 141 rivers in North America, as compiled by the US Geological Survey. Let’s see if the river lengths are normally distributed by creating a normal probability plot of the river lengths.

Basic QQ Plot with Normal Distribution Reference Line (Base R)

Here we create a basic QQ plot in base R and add the normal distribution QQ plot as a reference line for comparison.

qqnorm(rivers,
       main = "Q-Q Plot of North American River Lengths")
qqline(rivers, col = "red")

Basic QQ Plot with Normal Distribution Reference Line (ggplot2)

Here we produce a similar plot using ggplot2. First we need to put the river lengths in a data frame.

rivers_df <- data.frame(length = rivers)

Now we make the plot. Note that the shape and size of the geom are revised to better match the base R plot.

ggplot(rivers_df, aes(sample = length)) +
  stat_qq(shape = 1, cex = 2) +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot of North American River Lengths",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) # Centers title

Mosaic Plot

Mosaic Plot with Base R

First, we will mosaic plot of Titanic ticket class vs embarkment location using titanic dataset. First check to see if there is any missing data.

table(titanic$Pclass)
## 
##  First Second  Third 
##    186    173    355
table(titanic$Embarked)
## 
##       C   Q   S 
##   2 130  28 554

Notice that there are two cases for which Embarked is blank. We will remove those cases.

mask <- titanic$Embarked %in% c("C", "Q", "S")
embarked <- titanic$Embarked[mask]
class <- titanic$Pclass[mask]

Make the mosaic plot.

mosaicplot(table(class, embarked),
           main="Passenger Class by Embarkment Site",
              xlab="Class",ylab="Embarkment Site",
              col=c(2,3,4))

Mosaic Plot with ggplot2

Let’s make a similar mosaic plot using ggplot2. We need to use the ggmosaic package to do so.

titanic_clean <- titanic %>% 
  filter(Embarked %in% c("C", "Q", "S")) %>%  # Removes cases with missing data
  mutate(
  Embarked = factor(embarked)
)
titanic_m_data <- titanic_clean %>% 
  count(Embarked, Pclass)
  ggplot(data = titanic_m_data) + 
    geom_mosaic(aes(x = product(Pclass), fill = Embarked, weight = n)) +
    xlab("Passenger Class") + 
    ylab("Embarkment Site") + 
  ggtitle("Passenger Class by Embarkment Site")

Scatterplots

Scatterplots in Base R

Creating a scatterplot in base R is simple. For this example, I used the dataset pennsylvania-history, which shows the history of pennsylvania’s recorded covid cases from January to March 7. The variables of interest are positive, the number of positive cases, and deaths. Did the death count rise as cases grew?

First, we read the data file, pennsylvania-history.csv and store it as covidchart. Our x values in the plot function are covidchart\(positive, and our yvalue is covidchart\)death. Our title, as explaieid in main is “Covid Infection and Fatalities.” Our x-axis is labeled Infections, for covid infections, and our y axis is labeled fatalities. Note that these fatalities are not explciitly confirmed to be from Covid, but are the overall fatality count in PA over time, each point representing a different count of infection and fatality per day. We then draw a regression line based on positive cases and fatalities. The regression line is colored red and the points are colored grey after the color scheme of the virus itself.

covidchart <- read.csv("data/pennsylvania-history.csv")
covidchart <- subset(covidchart, !is.na(positive) & !is.na(death))
plot(covidchart$positive, covidchart$death, main = "Covid Infection and Fatalities", xlab = "Infections", ylab = "Fatalities", col = "grey")
abline(lm(covidchart$death ~ covidchart$positive), col = "red")

We can see that, yes, positive cases unfortunately have a positive correlation with fatalities.

RColorBrewer in Base R

Another way to color your plot is by using rcolorbrewer. We first have to load rcolorbrewer. Note that we will be using an example barplot instead of a scatterplot.

dummy <- data.frame(colno = c("col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"), colcount = c(1, 2, 3, 4, 5, 6, 7, 8, 9))

Then, execute this to see all the available palettes.

display.brewer.all()

To view a single palette, type

display.brewer.pal(n, name)

name denotes the palette you want to display, while n denotes how many colors are in that palette. For example, the following code shows you the first four colors in the Pastel1 palette.

display.brewer.pal(4, "Pastel1")

brewer.pal(n, name)

returns the hex codes of the n colors in the named palette as a vector. Here we create a palette of all 9 colors in Pastel1 palette.

Past <- brewer.pal(9, "Pastel1")

From there, go back to your plot’s color argument and use the following code to set the colors of the bars to the first color in the “Past” palette we made with RColorBrewer in the code chunk above.

barplot(height = dummy$colcount, names = dummy$colno, xlab = "colors", ylab = "count", ylim = c(0,9), col = Past[1])

Alternatively, you could use the following code use every color in the palette to color the bars.

barplot(height = dummy$colcount, names = dummy$colno, xlab = "colors", ylab = "count", ylim = c(0,9), col = Past)

Scatterplots with ggplot2

ggplot2 is much the same. we set covidchart as the source of our data and positive and death as our x and y values in aes(). geom_point() is called to make ggplot graph this as a scatterplot. The labs() function labels our x and y axes, and the graph overall. To add a regression line, we use geom_smooth(method = lm).

ggplot(data = covidchart, aes(positive, death)) + geom_point(colour = "red") + labs(x = "Infections", y = "Fatalities", title = "Covid Infection and Fatalities") + geom_smooth(method = lm)

ColorBrewer with ggplot2

In order to use an RColorBrewer palette in a barplot in ggplot2, you have to first save your barplot in a variable, and then append the function scale_fill_brewer() to it, with the argument palette =“palettename” inside the function. Don’t forget to add code setting “fill” equal to your x variables (i.e. your bars), otherwise you’ll get a plain grey bar graph. Done correctly, code should look like this:

ggplot(data = dummy, aes(x = colno, y = colcount, fill = colno)) + 
  geom_bar(stat = "identity") + 
  labs(x = "ColorNumber", y = "ColorCount", title = "Color Dummy Data") +
  scale_fill_brewer(palette = "Pastel1")

Bubble Plot

Base R Bubble Plot

Base R plots require a bit finessing to get just right.

First of all, I plan to place the legend outside the plot itself so I must first change the margins. This is done with par() which can set graphical parameters. Within par(), mar takes a vector input of 4 numbers determining how far into the pane the plot margins should be, starting at the bottom and continuing clockwise. xpd takes a logical value & setting it to TRUE tells R not to clip the legend or any other insertion outside the plot bounds. inset=c(-0.35,0) tells R to move the legend away from the plot, outside the margin horizontally. To make the legend look a bit cleaner, use bty='n' to remove the box around the legend.

To set the point size in base plot, use cex=data$variable. In this case cex=mtcars$disp. For the legend, use pt.cex=c(n1, n2, n3...nx) for however many legend items you have. I’m choosing intervals of 100 from 100 to 400.

par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp,
     pch=16,
     main = 'Miles per Gallon vs Gross Horsepower with Displacement',
     xlab = 'miles per gallon',
     ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
       legend=c('100', '200', '300', '400'),
       title = 'Displacement\n(cu. in.)',
       pch=16, ncol=1, pt.cex = c(100, 200, 300, 400),
       bty = 'n')

Yikes, it seems like the points may be a bit too large. Lets make them all smaller by dividing their values by 100 & take care not to forget about the legend points as well by changing the values for pt.cex.

par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp/100,
     pch=16,
     main = 'Miles per Gallon vs Gross Horsepower with Displacement',
     xlab = 'miles per gallon',
     ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
       legend=c('100', '200', '300', '400'),
       title = 'Displacement\n(cu. in.)',
       pch=16, ncol=1, pt.cex = c(100, 200, 300, 400)/100,
       bty = 'n')

Better, but the points don’t seem to be proportional. It seems that cex scales points by altering the radius of a point. This would cause the area of each consecutive point to be exponentially larger than the last when using the formula \(A=πr^2\). To correct this and make the point size proportional to the area, rearrange the formula to solve for the radius that produces the desired area. \(r=\sqrt{\frac{A}{π}}\)

Now the points are proportional in area but still too large, so I divide the areas by 5.

par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = sqrt(mtcars$disp/3.14)/5,
     pch=16,
     main = 'Miles per Gallon vs Gross Horsepower with Displacement',
     xlab = 'miles per gallon',
     ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
       legend=c('100', '200', '300', '400'),
       title = 'Displacement\n(cu. in.)',
       pch=16, ncol=1, pt.cex = sqrt(c(100, 200, 300, 400)/3.14)/5,
       bty = 'n')

Now it looks right!

Ggplot2 Bubble Plot

When I need to make a plot, I usually prefer ggplot() over base r for its simplicity. As you can see, I was able to make a clean graph with just 5 lines of code. To show displacement as a variable point size, you set size=disp within the aesthetics of ggplot. Ggplot automatically scales by area & makes the points reasonable sizes as well. Because mtcars is set as the data source, there is no need to reference the data frame for each variable.

ggplot(mtcars, aes(x=mpg, y=hp, size=disp))+
  geom_point()+
  ggtitle('Miles per Gallon vs Gross Horsepower with Displacement')+
  labs(size = 'Displacement\n(cu. in.)', x='miles per gallon', y='gross horsepower')+
  theme_classic()

Diverging Bar Chart

data(swiss)
#Data Preparation
swiss$`municipality` <- rownames(swiss)  # create new column for municipality names
swiss$Fertility_z <- round((swiss$Fertility - mean(swiss$Fertility))/sd(swiss$Fertility), 2)  # compute normalized fertility
swiss$Fertility_type <- ifelse(swiss$Fertility_z < 0, "below", "above")  # above / below avg flag
swiss <- swiss[order(swiss$Fertility_z), ]  # sort
swiss$`municipality` <- factor(swiss$`municipality`, levels = swiss$`municipality`)  # convert to factor to retain sorted order in plot.
#Diverging Barplot:
ggplot(swiss, aes(x=`municipality`, y=Fertility_z, label=Fertility_z)) + 
  geom_bar(stat='identity', aes(fill=Fertility_type), width=.5) +
  theme(text = element_text(size=9)) +
  scale_fill_manual(name="Fertility", 
                    labels = c("Above Average", "Below Average"), 
                    values = c("above"="#00ba38", "below"="#f8766d")) + 
  labs(title= "Divergence in the Average Fertility in Swiss Municipalities",
       x = "Municipality",
       y = 'Fertility') + 
  coord_flip()

Lollipop Chart

Lollipop Chart with ggplot2

To make a lollipop chart with ggplot2 you can use geom_point() for the circle and geom_segment() for the stem.

let <- data.frame(
  x=LETTERS[1:10],
  y=runif(10, 2, 9)
)
ggplot(let, aes(x=x, y=y)) +
  geom_segment( aes(x=x, xend=x, y=0, yend=y), color="black") +
  geom_point( size=4, color="blue", 
              fill=alpha("purple", 0.2), alpha=.8, shape=21, stroke=4) +
  theme_gray() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  xlab("Student") +
  ylab("Hours Spent Studying") +
  ggtitle("Student Hours of Study (per Week)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(limits = c(0, 10))

Diverging Lollipop Chart

Use the state data that comes with R. Let’s look at state life expectancy. First, prepare the data that we will use for a couple examples.

state_df <- data.frame(state.x77, Region=state.region, Division=state.division)
state_df <- rownames_to_column(state_df, var="State")
state_df$Life.Exp.Z <- 
  (state_df$Life.Exp - mean(state_df$Life.Exp))/sd(state_df$Life.Exp)
state_df$Above.Avg <- ifelse(state_df$Life.Exp.Z >= 0, "yes", "no")
state_df <- state_df[order(state_df$Life.Exp.Z), ]
state_df$State <- factor(state_df$State, levels = state_df$State)

Next create a simple diverging lollipop chart to highlight the difference of each state’s mean life expectancy from the mean state mean life expectancy.

ggplot(state_df, aes(x=State, y=Life.Exp)) + 
  geom_point(stat='identity', fill="black", size=2)  +
  geom_segment(aes(y = mean(Life.Exp), 
                   x = State, 
                   yend = Life.Exp, 
                   xend = State), 
               color = "black") +
  labs(title="State Life Expectancy", 
       subtitle="Deviation from Mean State Life Expectancy",
       y = "Life Expectancy (Years)",
       x = NULL) + 
  scale_y_continuous(breaks = seq(68, 74, 1)) +
  coord_flip()

Now let’s try to view a subset. With a smaller amount of data we can add the life expectancies as labels on the dots.

ggplot(filter(state_df, Region == "South"), aes(x=State, y=Life.Exp.Z, label=round(Life.Exp, 1))) + 
  geom_hline(yintercept = 0, color = "darkgray") +
  geom_point(stat='identity', aes(col=Above.Avg), size=6)  +
  geom_text(color="white", size=2.2) +
  labs(y = "Standard Deviations from Mean State Life Expectancy") +
  geom_text(aes(x = "West Virginia", 
                y = -0.05, 
                label = "Mean State Life Expectancy"),
            angle = 90,
            color = "darkgray") +
  labs(title="Life Expectancy in the Southern States", 
       subtitle="Compared to Average State Life Expectancy") + 
  coord_flip() +
  theme_bw() +
  theme(axis.title.y=element_blank()) +
  theme(legend.position = "none")

Making pie charts in R and ggplot2

Base R

Using titanic dataset.

slices <- table(titanic$Pclass)
## ^^ we use the table() function to get the count of each class
lbls <- c("1st Class", "2nd Class", "3rd Class")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) #this adds the percentage to the labels
lbls <- paste(lbls,"%",sep="")
pie(slices, labels = lbls, main = "Passenger Class Distribution", col = rainbow(length(lbls)))

Pie Chart Using ggplot2

#ClassCount <- as.data.frame(table(nums$Pclass))
#I tried to get the data from a csv but I was genuinely unable to
ClassCount <- data.frame(
  Class = c("1st", "2nd", "3rd"),
  n = c(186, 173, 355),
  prop = c(26.1, 24.2, 49.7)
)
ClassCount <- ClassCount %>%
  arrange(desc(Class)) %>%
  mutate(lab.ypos = cumsum(prop) -0.5*prop)
## the cumsum(prop) - 0.5*prop helps us put the labels in the center of the portion
mycols <- rainbow(length(ClassCount$Class))
ggplot(ClassCount, aes(x = "", y = prop, fill = Class)) +
  geom_bar(width = 1, stat = "identity", color = "black") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.ypos, label = prop), color = "black") +
  scale_fill_manual(values = mycols) + theme_void()

Chloropleth

Create a chloropleth using color to show population density of European countries. First, download geojson data using the geojson_read() function from the geojsonio package. what = "sp" instructs the function to return the data as a SpatialPolygonsDataFrame type. This type contains vertices data that can be used to draw polygons, as well as additional information associated with the polygon. In this case the polygons are European countries and the associated data includes population and area data from 2005.

We are using the data from “https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/GeoJSON/europe.geojson” stored under spdf

class(spdf)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"

The SpatialPolygonsDataFrame is implemented as an S4 class in R, so we can extract the data attribute (data associated with the polygons) with @. The data attribute is a data frame.

spdf_data <- spdf@data %>%
  mutate(id = row_number())

To use the geospatial data with ggplot we first need to transform it into a data frame using the tidy() function of the broom package. The strtoi() function here is converting the id field in the resulting dataframe to an integer, so that it is the same type as the id field in spdf_data.

spdf_tibble <- broom::tidy(spdf) %>% 
  mutate(id = strtoi(id))

Merge the geospatial data with the data about the polygons (countries).

# Make the merge
spdf_tibble = spdf_tibble %>%
  left_join(. , spdf_data, by="id")

Plot the countries, with color determined by the country’s population.

ggplot() +
  geom_polygon(data = spdf_tibble, 
               aes(fill = POP2005, 
                   x = long, 
                   y = lat, 
                   group = group), size=0, alpha=0.9) +
  theme_void() +
  coord_map()

Do the same, but use the color palette from the viridis package.

ggplot() +
  geom_polygon(data = spdf_tibble, 
               aes(fill = POP2005, 
                   x = long, 
                   y = lat, 
                   group = group )) +
  scale_fill_viridis() +
  labs(title = "Population") +
  theme_void() +
  coord_map()

Mathematical Annotation in R

Examples

x <- seq(-4, 4, len = 101)
y <- cbind(sin(x), cos(x))
matplot(x, y, type = "l", xaxt = "n",
        main = expression(paste(plain(sin) * phi, "  and  ",
                                plain(cos) * phi)),
        ylab = expression("sin" * phi, "cos" * phi), # only 1st is taken
        xlab = expression(paste("Phase Angle ", phi)),
        col.main = "blue")
axis(1, at = c(-pi, -pi/2, 0, pi/2, pi),
     labels = expression(-pi, -pi/2, 0, pi/2, pi))

## How to combine "math" and numeric variables :
plot(1:10, type="n", xlab="", ylab="", main = "plot math & numbers")
theta <- 1.23 ; mtext(bquote(hat(theta) == .(theta)))
for(i in 2:9)
    text(i,i+1, substitute(list(xi,eta) == group("(",list(x,y),")"),
                           list(x=i, y=i+1)))

## note that both of these use calls rather than expressions.
plot(1:10, 1:10)
text(4, 9, expression(hat(beta) == (X^t * X)^{-1} * X^t * y))
text(4, 8.4, "expression(hat(beta) == (X^t * X)^{-1} * X^t * y)",
     cex = .8)
text(4, 7, expression(bar(x) == sum(frac(x[i], n), i==1, n)))
text(4, 6.4, "expression(bar(x) == sum(frac(x[i], n), i==1, n))",
     cex = .8)
text(8, 5, expression(paste(frac(1, sigma*sqrt(2*pi)), " ",
                            plain(e)^{frac(-(x-mu)^2, 2*sigma^2)})),
     cex = 1.2)

HeatMap Plot

Using Phone data L

L1 <-
  L %>%select(country,X1995,X1996,X1997,X1998,X1999,X2000,X2001,X2002,X2003,X2004,X2005,)
L2 <-
  L1 %>%filter(country=='Argentina'|country=='Brazil'|country=='Chile'|country=='Colombia'|country=='Portugal'|country=='Barbados'|country=='Mexico'|country=='Spain'|country=='Egypt')
for (col in 1:ncol(L2)){
 colnames(L2)[col] <- sub('X', '', colnames(L2)[col])
}
L3<-melt(L2)
plot<-ggplot(L3, aes(variable,country,fill= value)) + geom_tile()
plot1<- plot+xlab('Year')+
 ylab('Country')+
 ggtitle('Mobile Cell Phone Subs 1995-2005')+
 labs(fill = 'Per capita subscriptions')
plot1 + scale_fill_distiller(palette = 'Spectral')

Querying data directly from USGS to use for plot

# The URL used to query the USGS database 
# includes parameters including data type, site number, and dates
URL = 'https://waterdata.usgs.gov/usa/nwis/dv?cb_00060=on&format=rdb&site_no=03049500&referred_module=sw&period=&begin_date=1970-01-01&end_date=2020-12-31'
# data is displayed with some extra comments so it takes a little additional processing to clean up
d = read.delim(URL, comment.char = '#', stringsAsFactors = F)
d=d[-1,]
d$datetime=as.POSIXct(d$datetime)
colnames(d)[4]='discharge'
d$discharge = as.numeric(d$discharge)
# creating month & year variabes for aggregation
d$month=month(d$datetime)
d$year=year(d$datetime)
# aggregating discharge by month & year to return means
aggDischarge=aggregate(discharge~month+year, d, mean)
# plotting just by month, 1 (Jan) is at the bottom and 12 (Dec) is at the top
# there are ways to revserse the order of scales but introducing axis labels complicates that
# I instead am reversing the order of months and using those values for plotting 
aggDischarge$plotMonth = abs(aggDischarge$month - 12) + 1

Creating a heat map of mean daily discharge from 1970-2020

ggplot(aggDischarge, aes(x=year, y=plotMonth, fill=discharge))+ 
  geom_tile()+
  scale_fill_viridis(discrete = F)+
  scale_y_continuous(limits = c(0, 13), # setting limits to 12 causes the top set of tiles to go missing but increasing it to 13 includes them all
                     breaks = seq(1, 12, 1), # one break for each month so none are skipped
                     labels = c('Dec', 'Nov', 'Oct', 'Sep', 'Aug', 'Jul',
                                'Jun', 'May', 'Apr', 'Mar', 'Feb', 'Jan'),
                     expand = c(0, 0))+ # removing some empty space above the x axis and below the title
  labs(x = 'year', y = 'month', fill = 'Mean Daily\nDischarge\ncu. ft/sec', 
       title = 'Mean Daily Discharge of the Allegheny River at Natrona, PA')+
  theme_classic()

Plots side by side - gridExtra Package - grid.arrange()

Resources used https://www.youtube.com/watch?v=GVWSGr4OLj8&list=PLhUoWCH9bWbxje8xauO7wagHzYJclj9cx&index=7

x = c(1, 2, 3, 4, 5, 6, 7)
y = c(8, 9, 10, 11, 12, 13, 14)
data1 = data.frame(x, y)
pointex <- ggplot(data1, aes(x = x, y = y)) +
  geom_point() +
  labs(x = "examplelength",
       y = "examplewidth")
pointex

nums = c(3, 8, 4, 5, 0.2, 0.5, 20)
int = c(0, 3, 9, 5, 1.5, 15, 5.3)
data2 = data.frame(nums, int)
pointex2 <- ggplot(data2, aes(x = nums, y= int)) +
  geom_point()+
  labs(x="miles",
       y="hours")
pointex2

plotex <- list(pointex, pointex2)
grid.arrange(grobs = plotex, ncol = 2, top= "Main Title")

Stacked Bar plot

Base R

fishy <- read.table(text ="Butterfish Cockle Swordfish Tuna
1 59 49 18 24 
2 32 24 71 31", header= TRUE)
  
barplot(as.matrix(fishy),col=c("blue","purple"))

GGplot

fish <- read.csv('data/Pescado.csv')
names(fish)
## [1] "X"       "Species" "Sex"     "Total"   "X.1"     "X.2"
ggplot(fish, aes(y=Total, x=Species, fill=Sex))+
  geom_bar(stat='identity')

##Hexbin Map

Using spd dataset

# Bit of reformating
spd@data = spd@data %>%
  mutate(google_name = gsub(" \\(United States\\)", "", google_name))
# Show it
plot(spd)

spd@data = spd@data %>% mutate(google_name = gsub(" \\(United States\\)", "", google_name))
spd_fortified <- tidy(spd, region = "google_name")
# Calculate the centroid of each hexagon to add the label:
centers <- cbind.data.frame(data.frame(gCentroid(spd, byid=TRUE), id=spd@data$iso3166_2))
 
# Now I can plot this shape easily as described before:
ggplot() +
  geom_polygon(data = spd_fortified, aes( x = long, y = lat, group = group), fill="skyblue", color="white") +
  geom_text(data=centers, aes(x=x, y=y, label=id)) +
  theme_void() +
  coord_map()

Using hockey dataset

spd_fortified <- spd_fortified %>%
  left_join(. , hockey, by=c("id"="State")) 
 
# Make a first chloropleth map
ggplot() +
  geom_polygon(data = spd_fortified, aes(fill =  Searches, x = long, y = lat, group = group)) +
  geom_text(data=centers, aes(x=x, y=y, label=id)) +
  scale_fill_gradient(trans = "log") +
  theme_void() +
  coord_map() + 
  ggtitle("Google Search Popularity for Hockey by State")

options(scipen=999)
#this turns the scientific notation off
theme_set(theme_bw())
#^ this helps get a good theme for this graph
data("midwest", package = "ggplot2")

# Scatterplot with line of best fit
gg <- ggplot(midwest, aes(x=area, y=poptotal)) + 
  geom_point(aes(col=state, size=popdensity)) + 
  geom_smooth(method="loess", se=F) +    #this gets us the line of best fit
  xlim(c(0, 0.1)) + 
  ylim(c(0, 500000)) + 
  labs(y="Population", 
       x="Area", 
       title="Area Vs Population", 
       caption = "Source: midwest")

plot(gg)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Scatter plot

basket <- nba_players

ggplot(data = basket, aes(x = AGE, y = PTS)) +
  geom_point(alpha = 1, aes(color = PTS))+
  geom_density_2d()+
  ggtitle('NBA Player Age to Points Made')

Dot Plot : shooting map in a soccer field

###I used a package called “ggsoccer”, and you can build a soccer pitch with this package. you can build the soccer pitch from scratch like I did before knowing ggsoccer, but it takes a lot of time and you might have mistakes because it is long

ggplot() +
  annotate_pitch() +
  theme_pitch()

### this is how you can build a soccer pitch from scratch   
  soccer_ns <- ggplot()+
  geom_rect(aes(xmin = -10, xmax = 100, ymin= -10, ymax = 130), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ ### Background outline
  geom_rect(aes(xmin = 0, xmax = 90, ymin= 0, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ ###Sideline, Endline
  geom_rect(aes(xmin = 0, xmax = 90, ymin= 0, ymax = 60), fill = NA , colour = "#FFFFFF", size = 0.5)+ #Halfway
  geom_rect(aes(xmin = 24.85, xmax = 65.15, ymin= 0, ymax = 16.5), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #lower 18 yard box
  geom_rect(aes(xmin = 35.85, xmax = 54.15, ymin= 0, ymax = 5.5), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ # Lower 5 yard box
  geom_rect(aes(xmin = 24.85, xmax = 65.15, ymin= 103.5, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #upper 18 yard box
  geom_rect(aes(xmin = 35.85, xmax = 54.15, ymin= 114.5, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #upper 5 yard box
  geom_curve(aes(x = 35.85, y = 16.5, xend = 54.15, yend = 16.5), curvature = -0.6, colour = "#FFFFFF")+ #lower D
  geom_curve(aes(x = 35.85, y = 103.5, xend = 54.15, yend = 103.5), curvature = .6, colour = "#FFFFFF")+ #upper D
  ggforce::geom_circle(aes(x0=45, y0=60, r= 9.15), colour = "#FFFFFF")+ ##Halfway circle
  theme(rect = element_blank(),
        line = element_blank(),
        text = element_blank())

### By default the pitch is white and grey, so we need to change the pitch color from white to green and the lines from grey to white 

ggplot() +
  annotate_pitch(colour = "white",
                 fill = "#3ab54a") +
  theme_pitch() +
  theme(panel.background = element_rect(fill = "#3ab54a")) 

###And here I created a shot map that basically shows the shots on target that it is shown in orange and the off target shots are shown in blue.

# Data
set.seed(1)
df <- data.frame(x = rnorm(20, 80, 10), 
                 y = rnorm(20, 50, 20),
                 Shot = sample(c("In", "Out"),
                               40, replace = TRUE))

ggplot(df) +
  annotate_pitch(colour = "white",
                 fill = "#3ab54a") +
  geom_point(aes(x = x, y = y, fill = Shot),
             shape = 21,
             size = 4) +
  coord_cartesian(xlim = c(45, 105))+ 
  theme_pitch() +
  theme(panel.background = element_rect(fill = "#3ab54a")) 

Basic Doughnut Chart with ggplot

# Create test data
fruit_survey <- data.frame(
  fruit=c("Apple", "Banana", "Honeydew", "Watermelon"),
  count=c(15, 25, 20, 40)
)

# Compute percentages
fruit_survey$fraction = fruit_survey$count / sum(fruit_survey$count)

# Compute the cumulative percentages (top of each rectangle)
fruit_survey$ymax = cumsum(fruit_survey$fraction)

# Compute the bottom of each rectangle
fruit_survey$ymin = c(0, head(fruit_survey$ymax, n=-1))

# Make the plot
ggplot(fruit_survey, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=fruit)) +
  geom_rect() +
  coord_polar(theta="y") + 
  xlim(c(2, 4))

Doughnut Chart with Base R

It is possible to build a doughnut chart in base R; however, it requires a very long ‘doughnut’ function. The function is included below, and the additional code to create an actual chart. The example is taken from https://www.r-graph-gallery.com/130-ring-or-donut-chart.html. It is suggested to just use ggplot whenever you want to create this kind of chart.

doughnut <-
function (x, labels = names(x), edges = 200, outer.radius = 0.8,
          inner.radius=0.6, clockwise = FALSE,
          init.angle = if (clockwise) 90 else 0, density = NULL,
          angle = 45, col = NULL, border = FALSE, lty = NULL,
          main = NULL, ...)
{
    if (!is.numeric(x) || any(is.na(x) | x < 0))
        stop("'x' values must be positive.")
    if (is.null(labels))
        labels <- as.character(seq_along(x))
    else labels <- as.graphicsAnnot(labels)
    x <- c(0, cumsum(x)/sum(x))
    dx <- diff(x)
    nx <- length(dx)
    plot.new()
    pin <- par("pin")
    xlim <- ylim <- c(-1, 1)
    if (pin[1L] > pin[2L])
        xlim <- (pin[1L]/pin[2L]) * xlim
    else ylim <- (pin[2L]/pin[1L]) * ylim
    plot.window(xlim, ylim, "", asp = 1)
    if (is.null(col))
        col <- if (is.null(density))
          palette()
        else par("fg")
    col <- rep(col, length.out = nx)
    border <- rep(border, length.out = nx)
    lty <- rep(lty, length.out = nx)
    angle <- rep(angle, length.out = nx)
    density <- rep(density, length.out = nx)
    twopi <- if (clockwise)
        -2 * pi
    else 2 * pi
    t2xy <- function(t, radius) {
        t2p <- twopi * t + init.angle * pi/180
        list(x = radius * cos(t2p),
             y = radius * sin(t2p))
    }
    for (i in 1L:nx) {
        n <- max(2, floor(edges * dx[i]))
        P <- t2xy(seq.int(x[i], x[i + 1], length.out = n),
                  outer.radius)
        polygon(c(P$x, 0), c(P$y, 0), density = density[i],
                angle = angle[i], border = border[i],
                col = col[i], lty = lty[i])
        Pout <- t2xy(mean(x[i + 0:1]), outer.radius)
        lab <- as.character(labels[i])
        if (!is.na(lab) && nzchar(lab)) {
            lines(c(1, 1.05) * Pout$x, c(1, 1.05) * Pout$y)
            text(1.1 * Pout$x, 1.1 * Pout$y, labels[i],
                 xpd = TRUE, adj = ifelse(Pout$x < 0, 1, 0),
                 ...)
        }
        ## Add white disc          
        Pin <- t2xy(seq.int(0, 1, length.out = n*nx),
                  inner.radius)
        polygon(Pin$x, Pin$y, density = density[i],
                angle = angle[i], border = border[i],
                col = "white", lty = lty[i])
    }

    title(main = main, ...)
    invisible(NULL)
}


# inner.radius controls the width of the ring
doughnut( c(3,5,9,12) , inner.radius=0.5, col=c(rgb(0.2,0.2,0.4,0.5), rgb(0.8,0.2,0.4,0.5), rgb(0.2,0.9,0.4,0.4) , rgb(0.0,0.9,0.8,0.4)) )
## Warning in rep(lty, length.out = nx): 'x' is NULL so the result will be NULL
## Warning in rep(density, length.out = nx): 'x' is NULL so the result will be NULL

After researching doughnut charts on several sites, I learned that they, along with pie charts are not very well liked by professionals. It was mentioned that readers have a hard time with reading angles, so charts like bar charts and histograms are more easily understood.

Parallel Coordinate Plot

Working with large data set

ggparcoord(data = nba_players,
           columns = 4:8,
           groupColumn = 'TEAM',
           scale = 'globalminmax',
           #scale = 'center',
           order = 'skewness',
           alphaLines = .3)+
   ggtitle('NBA AVG Distribution')

# Working with small data set

names(card)
## [1] "face"  "suit"  "value"
ggparcoord(data = card,
           columns = 1:3,
           alphaLines = .3,
           boxplot = TRUE,
           showPoints = TRUE)+
  ggtitle('Playing Card Distribution')